library(tidycensus)
library(tidyverse)
library(segregation)
library(tigris)
library(sf)
# Get California tract data by race/ethnicity
ca_acs_data <- get_acs(
geography = "tract",
variables = c(
white = "B03002_003",
black = "B03002_004",
asian = "B03002_006",
hispanic = "B03002_012"
),
state = "CA",
geometry = TRUE,
year = 2019
)
# Use tidycensus to get urbanized areas by population with geometry,
# then filter for those that have populations of 750,000 or more
us_urban_areas <- get_acs(
geography = "urban area",
variables = "B01001_001",
geometry = TRUE,
year = 2019,
survey = "acs1"
) %>%
filter(estimate >= 750000) %>%
transmute(urban_name = str_remove(NAME,
fixed(", CA Urbanized Area (2010)")))
# Compute an inner spatial join between the California tracts and the
# urbanized areas, returning tracts in the largest California urban
# areas with the urban_name column appended
ca_urban_data <- ca_acs_data %>%
st_join(us_urban_areas, left = FALSE) %>% # join = st_intersects by default
select(-NAME) %>%
st_drop_geometry()Modeling US Census data
R25 Modelers and Story Tellers
1 Introduction
In this tutorial, we learn to:
perform segregation analysis
regression analysis
Resources: Analyzing US Census Data Chapter 8 and R for Data Science.
2 Segregation and diversity
2.1 White-Hispanic dissimilarity index
Dissimilarity index \[ D = \frac 12 \sum_{i=1}^N | \frac{a_i}{A} - \frac{b_i}{B}|, \] where \(a_i\) represents the population of group \(A\) in a given areal unit \(i\); A is the total population of that group in the study region (e.g. a metropolitan area); and \(b_i\) and \(B\) are the equivalent metrics for the second group. \(D=0\) indicates perfect integration and \(D=1\) means complete segregation.
Dissimilarity in the LA metro area:
ca_urban_data %>%
filter(variable %in% c("white", "hispanic"),
urban_name == "Los Angeles--Long Beach--Anaheim") %>%
dissimilarity(
group = "variable", # race groups
unit = "GEOID", # geographic regions
weight = "estimate"
) stat est
1: D 0.5999229
Compute the dissimilarity for each urban areas in California.
ca_urban_data %>%
filter(variable %in% c("white", "hispanic")) %>%
group_by(urban_name) %>%
group_modify(~
dissimilarity(.x,
group = "variable",
unit = "GEOID",
weight = "estimate"
)
) %>%
arrange(desc(est)) %>%
print()# A tibble: 6 × 3
# Groups: urban_name [6]
urban_name stat est
<chr> <chr> <dbl>
1 Los Angeles--Long Beach--Anaheim D 0.600
2 San Francisco--Oakland D 0.514
3 San Jose D 0.494
4 San Diego D 0.490
5 Riverside--San Bernardino D 0.408
6 Sacramento D 0.369
2.2 Multi-group segregation indices
For a data set \(T\), - Mutual information index \[ M(T) = \sum_{u=1}^U \sum_{g=1}^G p_{ug} \log \frac{p_{ug}}{p_u p_g}. \] - Theil’s \(H\) is \[ H(T) = \frac{M(T)}{E(T)}, \] where \(E(T)\) is the entropy of \(T\), normalizing \(H\) to be between 0 and 1.
mutual_within(
data = ca_urban_data,
group = "variable",
unit = "GEOID",
weight = "estimate",
within = "urban_name",
wide = TRUE
) urban_name M p H ent_ratio
1: Los Angeles--Long Beach--Anaheim 0.3391033 0.50163709 0.2851662 0.9693226
2: Riverside--San Bernardino 0.1497129 0.08678082 0.1408461 0.8664604
3: Sacramento 0.1658898 0.07369482 0.1426804 0.9477412
4: San Diego 0.2290891 0.12560720 0.2025728 0.9218445
5: San Francisco--Oakland 0.2685992 0.13945223 0.2116127 1.0346590
6: San Jose 0.2147445 0.07282785 0.1829190 0.9569681
2.3 Local segregation analysis
Patterns of segregation across the most segregated urban area, Los Angeles:
la_local_seg <- ca_urban_data %>%
filter(urban_name == "Los Angeles--Long Beach--Anaheim") %>%
mutual_local(
group = "variable",
unit = "GEOID",
weight = "estimate",
wide = TRUE
) %>%
print() GEOID ls p
1: 06037101110 0.28218462 0.0003362648
2: 06037101122 0.77904804 0.0002689957
3: 06037101210 0.10121933 0.0005088495
4: 06037101220 0.11823338 0.0002917425
5: 06037101300 0.65382196 0.0003093895
---
2769: 06071012700 0.05912488 0.0003110085
2770: 06111007405 0.49522902 0.0004648128
2771: 06111007511 0.43635422 0.0001729084
2772: 06111008304 0.32087536 0.0004579321
2773: 06111008305 0.36332355 0.0002955471
la_tracts_seg <- tracts("CA", cb = TRUE, year = 2019) %>%
inner_join(la_local_seg, by = "GEOID")
la_tracts_seg %>%
ggplot(aes(fill = ls)) +
geom_sf(color = NA) +
coord_sf(crs = 26946) +
scale_fill_viridis_c(option = "inferno") +
theme_void() +
labs(fill = "Local\nsegregation index")3 Regression modeling (Los Angeles)
Median home value by Census tract in the Los Angeles metropolitan area:
library(tidycensus)
library(sf)
variables_to_get <- c(
median_value = "B25077_001",
median_rooms = "B25018_001",
median_income = "DP03_0062",
total_population = "B01003_001",
median_age = "B01002_001",
pct_college = "DP02_0068P",
pct_foreign_born = "DP02_0094P",
pct_white = "DP05_0077P",
median_year_built = "B25037_001",
percent_ooh = "DP04_0046P"
)
lametro_data <- get_acs(
geography = "tract",
variables = variables_to_get,
state = "CA",
county = c("Los Angeles", "Orange"),
geometry = TRUE,
output = "wide",
year = 2020
) %>%
select(-NAME) %>%
# slice(-c(572, 1939)) %>%
st_transform(4267) %>%
print(width = Inf)Simple feature collection with 3112 features and 21 fields (with 4 geometries empty)
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -118.9437 ymin: 32.80142 xmax: -117.4124 ymax: 34.82331
Geodetic CRS: NAD27
# A tibble: 3,112 × 22
GEOID median_valueE median_valueM median_roomsE median_roomsM
* <chr> <dbl> <dbl> <dbl> <dbl>
1 06037101110 502200 51606 4.6 0.3
2 06037101122 658800 21673 5.7 0.5
3 06037101220 550800 39781 4.1 0.3
4 06037101221 461600 75638 3.3 0.6
5 06037101222 378400 93996 3.7 0.3
6 06037101300 643200 24681 5.3 0.3
7 06037101400 554300 55738 5.2 0.5
8 06037102103 658200 50657 5 0.3
9 06037102104 630200 44747 5.5 0.4
10 06037102105 545400 71790 4.6 0.4
total_populationE total_populationM median_ageE median_ageM
* <dbl> <dbl> <dbl> <dbl>
1 3923 460 44.3 2.6
2 4119 858 52.2 2.6
3 3775 474 42.2 5
4 3787 651 40.5 6.1
5 2717 442 39.3 3
6 3741 478 52.6 3.9
7 3246 534 46.9 5.7
8 1920 495 37.7 2.8
9 4187 612 45.8 7.3
10 1797 336 42.2 4.7
median_year_builtE median_year_builtM median_incomeE median_incomeM
* <dbl> <dbl> <dbl> <dbl>
1 1958 3 74625 27799
2 1964 3 93125 15209
3 1959 5 55682 9264
4 1975 5 46274 13226
5 1976 9 30016 9714
6 1958 2 87066 27204
7 1962 5 66210 28061
8 1958 3 59005 15907
9 1973 2 98973 19597
10 1952 6 82438 15646
pct_collegeE pct_collegeM pct_foreign_bornE pct_foreign_bornM pct_whiteE
* <dbl> <dbl> <dbl> <dbl> <dbl>
1 27.8 5.2 34.8 6.4 61
2 34.1 9.8 34.6 9.9 76.7
3 26.2 7.2 44.4 8.7 43.9
4 24 9.8 50.9 9.2 53.5
5 17.1 7.4 55.7 9.6 54
6 33.2 7.1 45 6.4 80.4
7 30.9 6.4 34.6 7.5 69.5
8 35.7 8.1 42.8 7.9 75
9 41 8.5 37.1 7.6 68.8
10 28.4 7.1 36.4 7.8 23
pct_whiteM percent_oohE percent_oohM
* <dbl> <dbl> <dbl>
1 7.7 58.3 7.6
2 8.1 74.9 10.4
3 7.4 42.7 7.9
4 10.4 20.4 9.2
5 13.3 11.4 8.3
6 6.9 86.8 5.6
7 8.4 69.8 9.5
8 9.7 57.9 14.6
9 8.6 78.4 7.5
10 5.6 55.8 11.9
geometry
* <MULTIPOLYGON [°]>
1 (((-118.2999 34.25961, -118.2999 34.26321, -118.297 34.26322, -118.2904 34.2…
2 (((-118.3024 34.27381, -118.2988 34.27526, -118.2989 34.2767, -118.2965 34.2…
3 (((-118.285 34.25227, -118.285 34.25589, -118.2841 34.25589, -118.2773 34.25…
4 (((-118.2986 34.25598, -118.2923 34.25594, -118.2901 34.25593, -118.2864 34.…
5 (((-118.2923 34.25232, -118.2877 34.2523, -118.2864 34.25331, -118.2864 34.2…
6 (((-118.2773 34.25165, -118.276 34.25158, -118.276 34.25308, -118.2773 34.25…
7 (((-118.3212 34.24958, -118.3158 34.2486, -118.3115 34.24887, -118.3062 34.2…
8 (((-118.3644 34.2287, -118.3546 34.22852, -118.3498 34.2285, -118.3425 34.22…
9 (((-118.3547 34.22003, -118.3529 34.22141, -118.3493 34.22136, -118.345 34.2…
10 (((-118.3522 34.21382, -118.3509 34.21514, -118.3476 34.21231, -118.3446 34.…
# … with 3,102 more rows
Visualization:
library(tidyverse)
library(patchwork)
mhv_map <- ggplot(lametro_data, aes(fill = median_valueE)) +
geom_sf(color = NA) +
scale_fill_viridis_c(labels = scales::label_dollar()) +
theme_void() +
labs(fill = "Median home value ")
mhv_histogram <- ggplot(lametro_data, aes(x = median_valueE)) +
geom_histogram(alpha = 0.5, fill = "navy", color = "navy",
bins = 100) +
theme_minimal() +
scale_x_continuous(labels = scales::label_number_si(accuracy = 0.1)) +
labs(x = "Median home value")Warning: `label_number_si()` was deprecated in scales 1.2.0.
ℹ Please use the `scale_cut` argument of `label_number()` instead.
mhv_map + mhv_histogramWarning: Removed 155 rows containing non-finite values (`stat_bin()`).
Log-transform:
library(tidyverse)
library(patchwork)
mhv_map_log <- ggplot(lametro_data, aes(fill = log(median_valueE))) +
geom_sf(color = NA) +
scale_fill_viridis_c() +
theme_void() +
labs(fill = "Median home\nvalue (log)")
mhv_histogram_log <- ggplot(lametro_data, aes(x = log(median_valueE))) +
geom_histogram(alpha = 0.5, fill = "navy", color = "navy",
bins = 100) +
theme_minimal() +
scale_x_continuous() +
labs(x = "Median home value (log)")
mhv_map_log + mhv_histogram_logWarning: Removed 155 rows containing non-finite values (`stat_bin()`).
Feature engineering:
library(sf)
library(units)
lametro_data_for_model <- lametro_data %>%
mutate(pop_density = as.numeric(set_units(total_populationE / st_area(.), "1/km2")),
median_structure_age = 2018 - median_year_builtE) %>%
select(!ends_with("M")) %>%
rename_with(.fn = ~str_remove(.x, "E$")) %>%
drop_na() %>%
slice(-c(572, 1939)) %>% # Catalina Island: Avalon
print(width = Inf)Simple feature collection with 2912 features and 13 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -118.9437 ymin: 33.38776 xmax: -117.4124 ymax: 34.82331
Geodetic CRS: NAD27
# A tibble: 2,912 × 14
GEOID median_value median_rooms total_population median_age
<chr> <dbl> <dbl> <dbl> <dbl>
1 06037101110 502200 4.6 3923 44.3
2 06037101122 658800 5.7 4119 52.2
3 06037101220 550800 4.1 3775 42.2
4 06037101221 461600 3.3 3787 40.5
5 06037101222 378400 3.7 2717 39.3
6 06037101300 643200 5.3 3741 52.6
7 06037101400 554300 5.2 3246 46.9
8 06037102103 658200 5 1920 37.7
9 06037102104 630200 5.5 4187 45.8
10 06037102105 545400 4.6 1797 42.2
median_year_built median_income pct_college pct_foreign_born pct_white
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1958 74625 27.8 34.8 61
2 1964 93125 34.1 34.6 76.7
3 1959 55682 26.2 44.4 43.9
4 1975 46274 24 50.9 53.5
5 1976 30016 17.1 55.7 54
6 1958 87066 33.2 45 80.4
7 1962 66210 30.9 34.6 69.5
8 1958 59005 35.7 42.8 75
9 1973 98973 41 37.1 68.8
10 1952 82438 28.4 36.4 23
percent_ooh
<dbl>
1 58.3
2 74.9
3 42.7
4 20.4
5 11.4
6 86.8
7 69.8
8 57.9
9 78.4
10 55.8
geometry
<MULTIPOLYGON [°]>
1 (((-118.2999 34.25961, -118.2999 34.26321, -118.297 34.26322, -118.2904 34.2…
2 (((-118.3024 34.27381, -118.2988 34.27526, -118.2989 34.2767, -118.2965 34.2…
3 (((-118.285 34.25227, -118.285 34.25589, -118.2841 34.25589, -118.2773 34.25…
4 (((-118.2986 34.25598, -118.2923 34.25594, -118.2901 34.25593, -118.2864 34.…
5 (((-118.2923 34.25232, -118.2877 34.2523, -118.2864 34.25331, -118.2864 34.2…
6 (((-118.2773 34.25165, -118.276 34.25158, -118.276 34.25308, -118.2773 34.25…
7 (((-118.3212 34.24958, -118.3158 34.2486, -118.3115 34.24887, -118.3062 34.2…
8 (((-118.3644 34.2287, -118.3546 34.22852, -118.3498 34.2285, -118.3425 34.22…
9 (((-118.3547 34.22003, -118.3529 34.22141, -118.3493 34.22136, -118.345 34.2…
10 (((-118.3522 34.21382, -118.3509 34.21514, -118.3476 34.21231, -118.3446 34.…
pop_density median_structure_age
<dbl> <dbl>
1 3424. 60
2 1548. 54
3 5393. 59
4 10518. 43
5 9460. 42
6 1456. 60
7 513. 56
8 1621. 60
9 2508. 45
10 3678. 66
# … with 2,902 more rows
Linear regression:
formula <- "log(median_value) ~ median_rooms + median_income + pct_college + pct_foreign_born + pct_white + median_age + median_structure_age + percent_ooh + pop_density + total_population"
model1 <- lm(formula = formula, data = lametro_data_for_model)
summary(model1)
Call:
lm(formula = formula, data = lametro_data_for_model)
Residuals:
Min 1Q Median 3Q Max
-2.68883 -0.11751 0.01886 0.13418 0.86833
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.237e+01 5.007e-02 246.954 < 2e-16 ***
median_rooms 2.019e-02 9.279e-03 2.176 0.029613 *
median_income 4.952e-06 2.918e-07 16.971 < 2e-16 ***
pct_college 9.325e-03 4.570e-04 20.406 < 2e-16 ***
pct_foreign_born 2.798e-03 4.880e-04 5.733 1.09e-08 ***
pct_white 1.050e-03 3.575e-04 2.938 0.003331 **
median_age 9.132e-03 1.027e-03 8.894 < 2e-16 ***
median_structure_age 9.255e-05 1.111e-05 8.333 < 2e-16 ***
percent_ooh -5.492e-03 4.394e-04 -12.499 < 2e-16 ***
pop_density -4.028e-06 1.925e-06 -2.092 0.036529 *
total_population -1.154e-05 3.205e-06 -3.600 0.000324 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2544 on 2901 degrees of freedom
Multiple R-squared: 0.6683, Adjusted R-squared: 0.6671
F-statistic: 584.4 on 10 and 2901 DF, p-value: < 2.2e-16
Inspect collinearity:
library(corrr)
lametro_estimates <- lametro_data_for_model %>%
select(-GEOID, -median_value, -median_year_built) %>%
st_drop_geometry()
correlations <- correlate(lametro_estimates, method = "pearson")
network_plot(correlations)Collinearity can be diagnosed further by calculating the variance inflation factor (VIF) for the model, which takes into account not just pairwise correlations but the extent to which predictors are collinear with all other predictors. A VIF value of 1 indicates no collinearity; VIF values above 5 suggest a level of collinearity that has a problematic influence on model interpretation.
library(car)
vif(model1) median_rooms median_income pct_college
5.142594 5.153376 4.194830
pct_foreign_born pct_white median_age
1.818328 3.795816 2.257856
median_structure_age percent_ooh pop_density
1.035021 5.455227 1.972569
total_population
1.085428
Exercise: Remove median_income:
formula2 <- "log(median_value) ~ median_rooms + pct_college + pct_foreign_born + pct_white + median_age + median_structure_age + percent_ooh + pop_density + total_population"
model2 <- lm(formula = formula2, data = lametro_data_for_model)
summary(model2)
Call:
lm(formula = formula2, data = lametro_data_for_model)
Residuals:
Min 1Q Median 3Q Max
-2.59893 -0.11996 0.02226 0.14036 0.85423
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.229e+01 5.226e-02 235.099 < 2e-16 ***
median_rooms 8.255e-02 8.932e-03 9.241 < 2e-16 ***
pct_college 1.386e-02 3.886e-04 35.668 < 2e-16 ***
pct_foreign_born 2.763e-03 5.115e-04 5.402 7.14e-08 ***
pct_white 1.812e-03 3.718e-04 4.874 1.15e-06 ***
median_age 6.864e-03 1.067e-03 6.432 1.46e-10 ***
median_structure_age 9.175e-05 1.164e-05 7.881 4.56e-15 ***
percent_ooh -3.967e-03 4.509e-04 -8.797 < 2e-16 ***
pop_density -2.406e-06 2.016e-06 -1.194 0.23272
total_population -9.809e-06 3.358e-06 -2.921 0.00352 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2667 on 2902 degrees of freedom
Multiple R-squared: 0.6353, Adjusted R-squared: 0.6342
F-statistic: 561.8 on 9 and 2902 DF, p-value: < 2.2e-16
vif(model2) median_rooms pct_college pct_foreign_born
4.336348 2.760267 1.818296
pct_white median_age median_structure_age
3.735957 2.219631 1.035003
percent_ooh pop_density total_population
5.226919 1.967710 1.084332
3.1 PCA
PC1 captures the gradient that represents these social differences, with which multiple demographic characteristics will be associated.
pca <- prcomp(
formula = ~.,
data = lametro_estimates,
scale. = TRUE,
center = TRUE
)
summary(pca)Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 2.136 1.2074 1.0424 0.92760 0.86859 0.70091 0.61376
Proportion of Variance 0.456 0.1458 0.1087 0.08604 0.07545 0.04913 0.03767
Cumulative Proportion 0.456 0.6018 0.7105 0.79654 0.87199 0.92112 0.95879
PC8 PC9 PC10
Standard deviation 0.42617 0.34845 0.33031
Proportion of Variance 0.01816 0.01214 0.01091
Cumulative Proportion 0.97695 0.98909 1.00000
PC loadings:
pca_tibble <- pca$rotation %>%
as_tibble(rownames = "predictor") %>%
print()# A tibble: 10 × 11
predictor PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 median_roo… 0.366 -0.400 0.148 -0.00847 0.0171 -0.355 1.04e-1 -0.279
2 total_popu… -0.0262 -0.413 -0.547 0.636 0.239 0.234 1.14e-1 0.0142
3 median_age 0.341 0.118 0.165 -0.212 0.455 0.472 5.14e-1 0.226
4 median_inc… 0.414 0.0368 -0.0582 0.130 0.143 -0.446 -2.19e-1 0.200
5 pct_college 0.332 0.448 -0.185 0.148 0.235 -0.101 -3.20e-1 0.366
6 pct_foreig… -0.298 -0.108 0.130 -0.169 0.785 -0.0650 -3.66e-1 -0.308
7 pct_white 0.352 0.426 -0.202 0.0924 -0.0260 0.104 7.93e-2 -0.774
8 percent_ooh 0.371 -0.401 0.194 -0.0638 0.0490 -0.153 1.66e-1 0.0226
9 pop_density -0.337 0.275 -0.125 0.0867 0.198 -0.588 6.28e-1 0.0272
10 median_str… -0.0604 0.156 0.710 0.680 0.0177 0.0616 2.69e-4 -0.0371
# … with 2 more variables: PC9 <dbl>, PC10 <dbl>
pca_tibble %>%
select(predictor:PC5) %>%
pivot_longer(PC1:PC5, names_to = "component", values_to = "value") %>%
ggplot(aes(x = value, y = predictor)) +
geom_col(fill = "darkgreen", color = "darkgreen", alpha = 0.5) +
facet_wrap(~component, nrow = 1) +
labs(y = NULL, x = "Value") +
theme_minimal()components <- predict(pca, lametro_estimates)
lametro_pca <- lametro_data_for_model %>%
select(GEOID, median_value) %>%
cbind(components)
ggplot(lametro_pca, aes(fill = PC1)) +
geom_sf(color = NA) +
theme_void() +
scale_fill_viridis_c()The map, along with the bar chart, helps us understand how the multiple variables represent latent social processes at play in Los Angeles metro area. The brighter yellow areas, which have higher values for PC1, are located in communities like Beverley, Malibu, Rancho Palos Verdes. These communities are segregated, predominantly non-Hispanic white, and are among the wealthiest neighborhoods in the entire United States.
PC regression:
pca_formula <- paste0("log(median_value) ~ ",
paste0('PC', 1:6, collapse = ' + '))
pca_model <- lm(formula = pca_formula, data = lametro_pca)
summary(pca_model)
Call:
lm(formula = pca_formula, data = lametro_pca)
Residuals:
Min 1Q Median 3Q Max
-2.44114 -0.12239 0.01819 0.14387 0.83395
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 13.339410 0.004998 2668.984 < 2e-16 ***
PC1 0.121129 0.002341 51.748 < 2e-16 ***
PC2 0.165475 0.004140 39.968 < 2e-16 ***
PC3 -0.020855 0.004795 -4.349 1.41e-05 ***
PC4 0.059137 0.005389 10.974 < 2e-16 ***
PC5 0.116333 0.005755 20.214 < 2e-16 ***
PC6 -0.051495 0.007132 -7.220 6.59e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2697 on 2905 degrees of freedom
Multiple R-squared: 0.6266, Adjusted R-squared: 0.6259
F-statistic: 812.6 on 6 and 2905 DF, p-value: < 2.2e-16
4 Spatial regression
4.1 Test spatial correlation
lametro_data_for_model$residuals <- residuals(model2)
ggplot(lametro_data_for_model, aes(x = residuals)) +
geom_histogram(bins = 100, alpha = 0.5, color = "navy",
fill = "navy") +
theme_minimal()Moran’s I test for spatial correlation:
library(spdep)
wts <- lametro_data_for_model %>%
poly2nb() %>%
nb2listw()
moran.test(lametro_data_for_model$residuals, wts)
Moran I test under randomisation
data: lametro_data_for_model$residuals
weights: wts
Moran I statistic standard deviate = 35.823, p-value < 2.2e-16
alternative hypothesis: greater
sample estimates:
Moran I statistic Expectation Variance
0.3895827002 -0.0003435246 0.0001184783
lametro_data_for_model$lagged_residuals <- lag.listw(wts, lametro_data_for_model$residuals)
ggplot(lametro_data_for_model, aes(x = residuals, y = lagged_residuals)) +
theme_minimal() +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "red")4.2 Spatial lag model
Spatial lag model: \[ Y_i = \alpha + \rho Y_{\text{lag}-i} + \sum_k \beta_k X_{ki} + \epsilon_i, \] where \(w_{ij}\) represents the spatial weights, \(\rho\) measures the effect of the spatial lag in the outcome variable, and \(k\) is the number of predictors in the model.
library(spatialreg)
lag_model <- lagsarlm(
formula = formula2,
data = lametro_data_for_model,
listw = wts
)
summary(lag_model, Nagelkerke = TRUE)
Call:lagsarlm(formula = formula2, data = lametro_data_for_model, listw = wts)
Residuals:
Min 1Q Median 3Q Max
-2.248426 -0.088715 0.014409 0.109125 0.860765
Type: lag
Coefficients: (asymptotic standard errors)
Estimate Std. Error z value Pr(>|z|)
(Intercept) 4.3404e+00 2.0286e-01 21.3962 < 2.2e-16
median_rooms 9.1672e-02 7.1761e-03 12.7747 < 2.2e-16
pct_college 5.9231e-03 3.5537e-04 16.6672 < 2.2e-16
pct_foreign_born 1.4557e-03 4.0837e-04 3.5647 0.0003643
pct_white 1.7500e-04 3.0044e-04 0.5825 0.5602397
median_age 2.6586e-03 8.4933e-04 3.1302 0.0017469
median_structure_age 5.8519e-05 9.2648e-06 6.3163 2.678e-10
percent_ooh -2.9268e-03 3.6095e-04 -8.1086 4.441e-16
pop_density -5.9927e-06 1.6051e-06 -3.7336 0.0001888
total_population -8.4036e-06 2.6684e-06 -3.1493 0.0016366
Rho: 0.62897, LR test value: 1086.5, p-value: < 2.22e-16
Asymptotic standard error: 0.015948
z-value: 39.439, p-value: < 2.22e-16
Wald statistic: 1555.5, p-value: < 2.22e-16
Log likelihood: 265.2101 for lag model
ML residual variance (sigma squared): 0.044866, (sigma: 0.21182)
Nagelkerke pseudo-R-squared: 0.74891
Number of observations: 2912
Number of parameters estimated: 12
AIC: -506.42, (AIC for lm: 578.1)
LM test for residual autocorrelation
test value: 0.13424, p-value: 0.71407
4.3 Spatial error models
Spatial error models: \[ Y_i = \alpha + \sum_k \beta_k X_{ki} + u_i, \] where \[ u_i = \lambda u_{\text{lag}-i} + \epsilon_i \] and \[ u_{\text{lag}-i} = \sum_j w_{ij} u_j. \]
error_model <- errorsarlm(
formula = formula2,
data = lametro_data_for_model,
listw = wts
)
summary(error_model, Nagelkerke = TRUE)
Call:errorsarlm(formula = formula2, data = lametro_data_for_model,
listw = wts)
Residuals:
Min 1Q Median 3Q Max
-2.30256109 -0.08818706 -0.00055176 0.09152501 0.96055165
Type: error
Coefficients: (asymptotic standard errors)
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.2543e+01 5.2188e-02 240.3439 < 2.2e-16
median_rooms 1.4035e-01 7.6875e-03 18.2575 < 2.2e-16
pct_college 6.3327e-03 4.7800e-04 13.2484 < 2.2e-16
pct_foreign_born 8.1680e-04 6.3353e-04 1.2893 0.197298
pct_white 2.6800e-03 5.0010e-04 5.3589 8.372e-08
median_age 5.1477e-04 9.2398e-04 0.5571 0.577443
median_structure_age 2.7533e-05 1.0376e-05 2.6535 0.007965
percent_ooh -3.2714e-03 3.8320e-04 -8.5370 < 2.2e-16
pop_density -1.6847e-05 1.7724e-06 -9.5049 < 2.2e-16
total_population 3.4308e-07 2.6632e-06 0.1288 0.897498
Lambda: 0.7956, LR test value: 1247, p-value: < 2.22e-16
Asymptotic standard error: 0.013441
z-value: 59.191, p-value: < 2.22e-16
Wald statistic: 3503.6, p-value: < 2.22e-16
Log likelihood: 345.4269 for error model
ML residual variance (sigma squared): 0.039582, (sigma: 0.19895)
Nagelkerke pseudo-R-squared: 0.76237
Number of observations: 2912
Number of parameters estimated: 12
AIC: NA (not available for weighted model), (AIC for lm: 578.1)
Choosing between spatial lag and spatial error models:
moran.test(lag_model$residuals, wts)
Moran I test under randomisation
data: lag_model$residuals
weights: wts
Moran I statistic standard deviate = -0.19565, p-value = 0.5776
alternative hypothesis: greater
sample estimates:
Moran I statistic Expectation Variance
-0.0024709813 -0.0003435246 0.0001182411
moran.test(error_model$residuals, wts)
Moran I test under randomisation
data: error_model$residuals
weights: wts
Moran I statistic standard deviate = -8.7047, p-value = 1
alternative hypothesis: greater
sample estimates:
Moran I statistic Expectation Variance
-0.0949433026 -0.0003435246 0.0001181065
Lagrange multiplier tests:
lm.LMtests(
model2,
wts,
test = c("LMerr", "LMlag", "RLMerr", "RLMlag")
)
Lagrange multiplier diagnostics for spatial dependence
data:
model: lm(formula = formula2, data = lametro_data_for_model)
weights: wts
LMerr = 1274.6, df = 1, p-value < 2.2e-16
Lagrange multiplier diagnostics for spatial dependence
data:
model: lm(formula = formula2, data = lametro_data_for_model)
weights: wts
LMlag = 1271, df = 1, p-value < 2.2e-16
Lagrange multiplier diagnostics for spatial dependence
data:
model: lm(formula = formula2, data = lametro_data_for_model)
weights: wts
RLMerr = 136.37, df = 1, p-value < 2.2e-16
Lagrange multiplier diagnostics for spatial dependence
data:
model: lm(formula = formula2, data = lametro_data_for_model)
weights: wts
RLMlag = 132.79, df = 1, p-value < 2.2e-16
5 Geographically weighted regression
Choose the number of nearest neighbors by CV:
library(GWmodel)
library(sf)
lametro_data_sp <- lametro_data_for_model %>%
as_Spatial()
bw <- bw.gwr(
formula = formula2,
data = lametro_data_sp,
kernel = "bisquare",
adaptive = TRUE
)Take a cup of tea and have a break, it will take a few minutes.
-----A kind suggestion from GWmodel development group
Adaptive bandwidth: 1807 CV score: 160.4359
Adaptive bandwidth: 1125 CV score: 148.0399
Adaptive bandwidth: 702 CV score: 137.977
Adaptive bandwidth: 442 CV score: 148.5875
Adaptive bandwidth: 864 CV score: 142.1575
Adaptive bandwidth: 603 CV score: 134.9265
Adaptive bandwidth: 540 CV score: 134.1407
Adaptive bandwidth: 503 CV score: 139.8375
Adaptive bandwidth: 565 CV score: 134.046
Adaptive bandwidth: 578 CV score: 134.271
Adaptive bandwidth: 554 CV score: 133.8688
Adaptive bandwidth: 550 CV score: 133.9889
Adaptive bandwidth: 559 CV score: 133.9599
Adaptive bandwidth: 553 CV score: 133.8772
Adaptive bandwidth: 556 CV score: 133.9004
Adaptive bandwidth: 553 CV score: 133.8772
Adaptive bandwidth: 554 CV score: 133.8688
Fitting and evaluating the GWR model
formula2 <- "log(median_value) ~ median_rooms + pct_college + pct_foreign_born + pct_white + median_age + median_structure_age + percent_ooh + pop_density + total_population"
gw_model <- gwr.basic(
formula = formula2,
data = lametro_data_sp,
bw = bw,
kernel = "bisquare",
adaptive = TRUE
)names(gw_model)[1] "GW.arguments" "GW.diagnostic" "lm" "SDF"
[5] "timings" "this.call" "Ftests"
gw_model_results <- gw_model$SDF %>%
st_as_sf()
names(gw_model_results) [1] "Intercept" "median_rooms"
[3] "pct_college" "pct_foreign_born"
[5] "pct_white" "median_age"
[7] "median_structure_age" "percent_ooh"
[9] "pop_density" "total_population"
[11] "y" "yhat"
[13] "residual" "CV_Score"
[15] "Stud_residual" "Intercept_SE"
[17] "median_rooms_SE" "pct_college_SE"
[19] "pct_foreign_born_SE" "pct_white_SE"
[21] "median_age_SE" "median_structure_age_SE"
[23] "percent_ooh_SE" "pop_density_SE"
[25] "total_population_SE" "Intercept_TV"
[27] "median_rooms_TV" "pct_college_TV"
[29] "pct_foreign_born_TV" "pct_white_TV"
[31] "median_age_TV" "median_structure_age_TV"
[33] "percent_ooh_TV" "pop_density_TV"
[35] "total_population_TV" "Local_R2"
[37] "geometry"
ggplot(gw_model_results, aes(fill = Local_R2)) +
geom_sf(color = NA) +
scale_fill_viridis_c() +
theme_void()Percentage owner-occupied housing:
ggplot(gw_model_results, aes(fill = percent_ooh)) +
geom_sf(color = NA) +
scale_fill_viridis_c() +
theme_void() +
labs(fill = "Local β for \npercent_ooh")Population density:
ggplot(gw_model_results, aes(fill = pop_density)) +
geom_sf(color = NA) +
scale_fill_viridis_c() +
theme_void() +
labs(fill = "Local β for \npopulation density")6 Geodemographic classification
6.1 Aspatial K-means
set.seed(1983)
lametro_kmeans <- lametro_pca %>%
st_drop_geometry() %>%
select(PC1:PC8) %>%
kmeans(centers = 6)
table(lametro_kmeans$cluster)
1 2 3 4 5 6
738 144 380 455 612 583
lametro_clusters <- lametro_pca %>%
mutate(cluster = as.character(lametro_kmeans$cluster))
ggplot(lametro_clusters, aes(fill = cluster)) +
geom_sf(size = 0.1) +
scale_fill_brewer(palette = "Set1") +
theme_void() +
labs(fill = "Cluster ")PCA plot. PC1, which is a gradient from affluent/older/white to lower-income/younger/nonwhite, and PC2, which represents areas with high population densities and educational attainment on the low end to lower-density, less educated areas on the high end.
library(plotly)
cluster_plot <- ggplot(lametro_clusters,
aes(x = PC1, y = PC2, color = cluster)) +
geom_point() +
scale_color_brewer(palette = "Set1") +
theme_minimal()
ggplotly(cluster_plot) %>%
layout(legend = list(orientation = "h", y = -0.15,
x = 0.2, title = "Cluster"))6.2 SKATER
In other applications, an analyst may want to generate meaningful clusters that are constrained to be neighboring or contiguous areas. An application of this workflow might be sales territory generation, where sales representatives will be assigned to communities in which they have local market knowledge but also want to minimize overall travel time.
SKATER (Spatial ’K’luster Analysis by Tree Edge Removal) algorithm:
library(spdep)
library(bigDM)
input_vars <- lametro_pca %>%
select(PC1:PC8) %>%
st_drop_geometry() %>%
as.data.frame()
skater_nbrs <- poly2nb(lametro_pca, queen = TRUE) # 2 disjoint connected subgraphs
skater_nbrs_mod <- connect_subgraphs(carto = lametro_pca, ID.area = "GEOID", nb = skater_nbrs)$nbSearching for isolated areas:
0 region(s) with no links
Searching for disjoint connected subgraphs:
2 disjoint connected subgraphs
costs <- nbcosts(skater_nbrs_mod, input_vars)
skater_weights <- nb2listw(skater_nbrs_mod, costs, style = "B")mst <- mstree(skater_weights)
regions <- skater(
mst[, 1:2],
input_vars,
ncuts = 8,
crit = 10
)lametro_clusters$region <- as.character(regions$group)
ggplot(lametro_clusters, aes(fill = region)) +
geom_sf(size = 0.1) +
scale_fill_brewer(palette = "Set1") +
theme_void()